Let's import the data and take the tables that we need from the schema. Since it's an SQL database, we need to use queries to pull them.
import pandas as pd
import sqlite3

# Read sqlite query results into pandas DataFrames.
# try/finally guarantees the connection is closed even if a query raises.
conn = sqlite3.connect("database.sqlite")
try:
    matches = pd.read_sql_query("SELECT * from Match", conn)
    player_atts = pd.read_sql_query("SELECT * from Player_Attributes", conn)
finally:
    conn.close()
Let's take a look at what we're working with.
# Peek at the data (notebook display) and list every column name.
matches.head()
# list(...) replaces the redundant identity comprehension; prints identically.
print(list(matches.columns))
['id', 'country_id', 'league_id', 'season', 'stage', 'date', 'match_api_id', 'home_team_api_id', 'away_team_api_id', 'home_team_goal', 'away_team_goal', 'home_player_X1', 'home_player_X2', 'home_player_X3', 'home_player_X4', 'home_player_X5', 'home_player_X6', 'home_player_X7', 'home_player_X8', 'home_player_X9', 'home_player_X10', 'home_player_X11', 'away_player_X1', 'away_player_X2', 'away_player_X3', 'away_player_X4', 'away_player_X5', 'away_player_X6', 'away_player_X7', 'away_player_X8', 'away_player_X9', 'away_player_X10', 'away_player_X11', 'home_player_Y1', 'home_player_Y2', 'home_player_Y3', 'home_player_Y4', 'home_player_Y5', 'home_player_Y6', 'home_player_Y7', 'home_player_Y8', 'home_player_Y9', 'home_player_Y10', 'home_player_Y11', 'away_player_Y1', 'away_player_Y2', 'away_player_Y3', 'away_player_Y4', 'away_player_Y5', 'away_player_Y6', 'away_player_Y7', 'away_player_Y8', 'away_player_Y9', 'away_player_Y10', 'away_player_Y11', 'home_player_1', 'home_player_2', 'home_player_3', 'home_player_4', 'home_player_5', 'home_player_6', 'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10', 'home_player_11', 'away_player_1', 'away_player_2', 'away_player_3', 'away_player_4', 'away_player_5', 'away_player_6', 'away_player_7', 'away_player_8', 'away_player_9', 'away_player_10', 'away_player_11', 'goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross', 'corner', 'possession', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'LBH', 'LBD', 'LBA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'SJH', 'SJD', 'SJA', 'VCH', 'VCD', 'VCA', 'GBH', 'GBD', 'GBA', 'BSH', 'BSD', 'BSA']
115 columns is a little overwhelming. For each of the gambling companies in the dataframe, let's see how they historically favor teams. We're going to look at most of this analysis from the perspective of the home team.
# We're going to keep track of each outfit's record and the margins they're profiting.
bookies = ["B365", "BS", "BW", "GB", "IW", "LB", "PS", "SJ", "VC", "WH"]
stats = {b: {"count": 0, "right": 0, "margins": 0} for b in bookies}
# This one keeps track of "Vegas", the consensus picks, keyed by implied-probability percent.
accuracy = {k: {"right": 0, "count": 0} for k in range(0, 101)}
favs = []
outcomes = []
h_impl = []
a_impl = []
d_impl = []
# For each row (match)
for _, match in matches.iterrows():
    havg = 0
    aavg = 0
    davg = 0
    count = 0
    # Record the victorious team - Home, Away, or Draw
    if match["home_team_goal"] > match["away_team_goal"]:
        outcomes.append("H")
    elif match["home_team_goal"] < match["away_team_goal"]:
        outcomes.append("A")
    else:
        outcomes.append("Draw")
    avgmargin = 0
    margincount = 0
    # For each of the companies
    for outfit in stats:
        h_odds = match[outfit + "H"]
        a_odds = match[outfit + "A"]
        d_odds = match[outfit + "D"]
        # BUG FIX: skip an outfit entirely unless it quoted all three outcomes.
        # Previously the margin was computed unconditionally, so one missing
        # quote produced a NaN that silently poisoned the running margin sums,
        # and `count` tracked only the home quote while also dividing the
        # away/draw sums.
        if pd.isna(h_odds) or pd.isna(a_odds) or pd.isna(d_odds):
            continue
        havg += h_odds
        aavg += a_odds
        davg += d_odds
        count += 1
        # Was this company's favorite (lowest decimal odds) the actual winner?
        if h_odds < a_odds and h_odds < d_odds:
            stats[outfit]["count"] += 1
            if outcomes[-1] == "H":
                stats[outfit]["right"] += 1
        elif a_odds < h_odds and a_odds < d_odds:
            stats[outfit]["count"] += 1
            if outcomes[-1] == "A":
                stats[outfit]["right"] += 1
        elif d_odds < h_odds and d_odds < a_odds:
            stats[outfit]["count"] += 1
            if outcomes[-1] == "Draw":
                stats[outfit]["right"] += 1
        # And accumulate the margin (overround): sum of the implied probabilities.
        margincount += 1
        margin = 1 / h_odds + 1 / a_odds + 1 / d_odds
        avgmargin += margin
        stats[outfit]["margins"] += margin
    if count > 0:
        # Average margin across the quoting outfits; the implied probability
        # (as an int percent) is normalized by the margin so H+A+D sum to ~100.
        avgmargin = avgmargin / margincount
        h_impl.append(int(100 / (avgmargin * (havg / count))))
        a_impl.append(int(100 / (avgmargin * (aavg / count))))
        d_impl.append(int(100 / (avgmargin * (davg / count))))
        # Industry favorite = outcome with the lowest average odds
        # (comparing sums is equivalent since all divide by the same count).
        if havg < aavg and havg < davg:
            favs.append("H")
        elif aavg < havg and aavg < davg:
            favs.append("A")
        else:
            favs.append("Draw")
    else:
        # No bookie quoted this match; flag it for removal below.
        h_impl.append(0)
        a_impl.append(0)
        d_impl.append(0)
        favs.append("-")
# Add the columns that we were iteratively building
matches["favorite"] = favs
matches["outcome"] = outcomes
matches["h_prob"] = h_impl
matches["a_prob"] = a_impl
matches["d_prob"] = d_impl
# Drop rows with missing odds
matches.drop(matches[matches.favorite == "-"].index, inplace=True)
Now that we calculated all those statistics, let's interpret.
# For each bookmaker: favorite-pick hit rate (%) and average margin (overround).
for outfit in stats:
    print(outfit, 100*stats[outfit]["right"]/stats[outfit]["count"], stats[outfit]["margins"]/stats[outfit]["count"])
B365 53.46627408993576 1.0698651340739223 BS 53.431022158684776 1.0965238891774722 BW 53.39348235927821 1.0948602564884289 GB 53.45444142468889 1.097378059231999 IW 53.868855482799965 1.1414771061896793 LB 53.44501486084842 1.103940405252856 PS 52.788504714863045 1.026830014359718 SJ 53.58222222222222 1.0831567724728688 VC 53.469424299400124 1.0635240215982187 WH 53.577235772357724 1.1012188431274956
So most companies are only right ~53% of the time! You may be thinking, "I can guess more than 53% of soccer winners right! Pshh!". The second number, the margins, is where they get you. To convert from decimal odds, which we have here, to implied probability, we divide 1 by the odds.
For example, you place 5 dollars on Manchester City to beat Liverpool at 1.5 odds. This means: if City wins, you walk away with $7.50 in your pocket (including the original 5); if City loses, you lose the 5 dollars altogether. The implied probability in this case is 1/1.5 = 0.66666666666. That leaves only 0.33333... for both of the other outcomes: a Liverpool win and a draw. What these companies will do is set Liverpool's odds such that the implied probability is 0.23 and a draw is 0.12. If we add these all up, we get more than 1 — and that excess is the margin. Companies offer lower odds, and therefore lower payouts, by implying that each outcome has a better chance of occurring than what would be considered fair.
# Notebook cell output: the consensus margin left over from the last match iterated above.
avgmargin
1.0513674973503548
Let's calibrate the accuracy of the industry favorites vs actual winners.
# Tally, per implied-probability percent bucket, how often that outcome occurred.
for _, match in matches.iterrows():
    accuracy[match["h_prob"]]["count"] += 1
    accuracy[match["a_prob"]]["count"] += 1
    accuracy[match["d_prob"]]["count"] += 1
    if match["outcome"] == "H":
        accuracy[match["h_prob"]]["right"] += 1
    elif match["outcome"] == "A":
        # BUG FIX: away wins were being credited to the h_prob bucket.
        accuracy[match["a_prob"]]["right"] += 1
    elif match["outcome"] == "Draw":
        accuracy[match["d_prob"]]["right"] += 1
We make a scatterplot of predicted win probability vs relative win percentage.
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Calibration chart: implied-probability buckets vs the observed hit rate,
# with a secondary-axis bar trace showing how populated each bucket is.
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Keep only the probability buckets that actually occurred.
observed = [(k, v["right"], v["count"]) for k, v in accuracy.items() if v["count"] != 0]
x = [k for k, _, _ in observed]
counts = [c for _, _, c in observed]
ratios = [100 * r / c for _, r, c in observed]
fig.add_trace(go.Scatter(x=x, y=ratios, name="Bookies"))
fig.add_trace(go.Scatter(x=list(range(0, 101)), y=list(range(0, 101)), name="Expected"))
fig.add_trace(go.Bar(x=x, y=counts, opacity=0.2, name="Occurences"), secondary_y=True,)
fig.update_layout(title="Predicted vs Actual Win Probability", xaxis_title="Implied Probability",
                  yaxis_title="Actual Probability", height=600, width=800, yaxis_range=[0,100])
fig.update_yaxes(title_text="Count", showgrid=False, secondary_y=True)
fig.show()
Those discrepancies between the red and blue line are what we look to exploit. The so-called "value"
What does the distribution of outcomes look like compared to favored?
# Compare how often each outcome is made the bookies' favorite
# vs how often it actually occurs.
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Histogram(x=matches["favorite"], name="Bookie Favorite"))
fig.add_trace(go.Histogram(x=matches["outcome"], name="Outcome"))
fig.update_layout(title="Implied Favorites vs Outcome", xaxis_title="Favorite vs Outcome", yaxis_title="Count",
                  height=600, width=600, yaxis_range=[0,17000])
fig.show()
Only rarely are the lowest odds on a match given to a draw — yet draws occur far more often than they are made the favorite.
Here's where things can get interesting. What if we can use the attributes of the real-life players in a virtual game to predict the outcome of real-life matches? That's the idea of the rest of this analysis. My hypothesis is that as the difference in average rating goes up, a team is more likely to win.
First we need to join the dataframes of player attributes and matches. For each row in matches (1 match), there are 22 columns for the 22 players that start the match. We want to find the average rating of the players listed for each team. A few merges would do the trick, but the problem is that for each player in the player attributes table, there are multiple iterations of ratings that are updated over time. It would make sense to take the most recent rating for a player before a match, so that is what we do here.
# player_df will be the merged dataframe with all the ratings. We need to rename some
# columns so the merges can be looped: the attributes table's player_api_id column is
# walked through home_player_0 -> home_player_11 so each pass merges on the matching
# match column directly.
player_atts.rename(columns={'player_api_id': 'home_player_0'}, inplace=True)
player_df = matches
player_df.rename(columns={'date': 'date_x'}, inplace=True)
# Here we merge the matches and attributes dfs for each player on the home team on their id
for player_no in range(1, 12):
    # (player_no is already an int; the old int() cast and placeholder-free
    # f-strings were redundant)
    player_atts.rename(columns={f'home_player_{player_no - 1}': f'home_player_{player_no}'}, inplace=True)
    player_df = player_df.merge(player_atts[[f"home_player_{player_no}", "date", "overall_rating"]],
                                how="left", on=f'home_player_{player_no}')
    player_df.rename(columns={'overall_rating': f'home_ovr{player_no}'}, inplace=True)
    player_df.rename(columns={'date': f'date_{player_no}'}, inplace=True)
    # We want only rating snapshots dated before the match date
    player_df = player_df[player_df["date_x"] > player_df[f'date_{player_no}']]
    # To get the most recent, sort by rating date descending and keep the first duplicate
    player_df = player_df.sort_values(f'date_{player_no}', ascending=False).drop_duplicates(["match_api_id"])
    player_df = player_df.dropna(subset=[f'home_ovr{player_no}'])
    player_df = player_df.drop([f'date_{player_no}'], axis=1)
# And do the same for the away players: continue the rename chain from
# home_player_11 through away_player_0..11 so each merge hits the matching
# away_player_{n} column.
player_atts.rename(columns={'home_player_11': 'away_player_0'}, inplace=True)
for player_no in range(1, 12):
    player_atts.rename(columns={f'away_player_{player_no - 1}': f'away_player_{player_no}'}, inplace=True)
    player_df = player_df.merge(player_atts[[f"away_player_{player_no}", "date", "overall_rating"]],
                                how="left", on=f'away_player_{player_no}')
    player_df.rename(columns={'overall_rating': f'away_ovr{player_no}'}, inplace=True)
    player_df.rename(columns={'date': f'date_{player_no}'}, inplace=True)
    # Keep only rating snapshots dated before the match
    player_df = player_df[player_df["date_x"] > player_df[f'date_{player_no}']]
    # Most recent snapshot per match: sort newest-first, keep the first duplicate
    player_df = player_df.sort_values(f'date_{player_no}', ascending=False).drop_duplicates(["match_api_id"])
    player_df = player_df.dropna(subset=[f'away_ovr{player_no}'])
    player_df = player_df.drop([f'date_{player_no}'], axis=1)
# We'll lastly drop the columns we no longer need: lineup coordinates, raw player
# ids, per-event XML columns, and the individual bookmaker odds (already distilled
# into favorite/outcome/probability columns).
player_df = player_df.drop(['stage', 'home_player_X1', 'home_player_X2', 'home_player_X3', 'home_player_X4', 'home_player_X5',
                            'home_player_X6', 'home_player_X7', 'home_player_X8', 'home_player_X9', 'home_player_X10',
                            'home_player_X11', 'away_player_X1', 'away_player_X2', 'away_player_X3', 'away_player_X4',
                            'away_player_X5', 'away_player_X6', 'away_player_X7', 'away_player_X8', 'away_player_X9',
                            'away_player_X10', 'away_player_X11', 'home_player_Y1', 'home_player_Y2', 'home_player_Y3',
                            'home_player_Y4', 'home_player_Y5', 'home_player_Y6', 'home_player_Y7', 'home_player_Y8',
                            'home_player_Y9', 'home_player_Y10', 'home_player_Y11', 'away_player_Y1', 'away_player_Y2',
                            'away_player_Y3', 'away_player_Y4', 'away_player_Y5', 'away_player_Y6', 'away_player_Y7',
                            'away_player_Y8', 'away_player_Y9', 'away_player_Y10', 'away_player_Y11', 'home_player_1',
                            'home_player_2', 'home_player_3', 'home_player_4', 'home_player_5', 'home_player_6',
                            'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10', 'home_player_11',
                            'away_player_1', 'away_player_2', 'away_player_3', 'away_player_4', 'away_player_5',
                            'away_player_6', 'away_player_7', 'away_player_8', 'away_player_9', 'away_player_10',
                            'away_player_11', 'goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross',
                            'corner', 'possession', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA',
                            'LBH', 'LBD', 'LBA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'SJH', 'SJD', 'SJA', 'VCH',
                            'VCD', 'VCA', 'GBH', 'GBD', 'GBA', 'BSH', 'BSD', 'BSA',], axis=1)
# Notebook cell output: inspect the surviving columns after the merges and drops.
player_df.columns
Index(['id', 'country_id', 'league_id', 'season', 'date_x', 'match_api_id',
'home_team_api_id', 'away_team_api_id', 'home_team_goal',
'away_team_goal', 'favorite', 'outcome', 'h_prob', 'a_prob', 'd_prob',
'home_ovr1', 'home_ovr2', 'home_ovr3', 'home_ovr4', 'home_ovr5',
'home_ovr6', 'home_ovr7', 'home_ovr8', 'home_ovr9', 'home_ovr10',
'home_ovr11', 'away_ovr1', 'away_ovr2', 'away_ovr3', 'away_ovr4',
'away_ovr5', 'away_ovr6', 'away_ovr7', 'away_ovr8', 'away_ovr9',
'away_ovr10', 'away_ovr11'],
dtype='object')
Now that we have that data together, let's investigate how average rating affects the home team's win percentage.
import plotly.graph_objects as go
from collections import Counter

# Column names for the 11 home / 11 away per-player overall ratings
h_overalls = ['home_ovr1', 'home_ovr2', 'home_ovr3', 'home_ovr4', 'home_ovr5',
              'home_ovr6', 'home_ovr7', 'home_ovr8', 'home_ovr9', 'home_ovr10',
              'home_ovr11']
a_overalls = ['away_ovr1', 'away_ovr2', 'away_ovr3', 'away_ovr4',
              'away_ovr5', 'away_ovr6', 'away_ovr7', 'away_ovr8', 'away_ovr9',
              'away_ovr10', 'away_ovr11']
# Average ratings for home, away, and their difference. Selecting by the name
# lists above (instead of the previous positional iloc slices, whose meaning
# shifted as columns were appended) is robust to column order and actually
# uses the lists, which were previously defined but unused.
player_df['h_avg_rat'] = player_df[h_overalls].sum(axis=1) / 11
player_df['a_avg_rat'] = player_df[a_overalls].sum(axis=1) / 11
player_df['avg_rat_dif'] = player_df['h_avg_rat'] - player_df['a_avg_rat']
features = ['h_avg_rat', 'a_avg_rat', 'avg_rat_dif']
fig = make_subplots(rows=1, cols=3, subplot_titles=features)
row = 1
col = 1
# Plot each feature against the relative win/loss/draw percentage at that value
for feature in features:
    x0 = player_df[player_df["home_team_goal"] > player_df['away_team_goal']][feature]
    x1 = player_df[player_df["home_team_goal"] < player_df['away_team_goal']][feature]
    x2 = player_df[player_df["home_team_goal"] == player_df['away_team_goal']][feature]
    # master maps a feature value -> {"total", "win", "loss", "draw"} match counts
    master = dict()
    for label, values in zip(["win", "loss", "draw"], [x0, x1, x2]):
        ctr = Counter(values)
        for x in ctr:
            if x not in master:
                master[x] = {"total": 0, "win": 0, "loss": 0, "draw": 0}
            master[x]["total"] += ctr[x]
            master[x][label] = ctr[x]
    y0 = []
    y1 = []
    y2 = []
    for x in master:
        y0.append(master[x]["win"]/master[x]["total"])
        y1.append(master[x]["loss"]/master[x]["total"])
        y2.append(master[x]["draw"]/master[x]["total"])
    fig.add_trace(go.Scatter(name=f'{feature} Win', x=list(master.keys()), y=y0, mode="markers",
                             marker_color="red"), row=row, col=col)
    fig.add_trace(go.Scatter(name=f'{feature} Loss', x=list(master.keys()), y=y1, mode="markers",
                             marker_color="blue"), row=row, col=col)
    fig.add_trace(go.Scatter(name=f'{feature} Draw', x=list(master.keys()), y=y2, mode="markers",
                             marker_color="grey"), row=row, col=col)
    fig.update_layout(title=feature)
    col += 1
fig.update_layout(height=600, width=800, title_text="Feature Analysis with Relative Win Percentage")
fig.show()
This is certainly interesting. We see how higher ratings affect win percentages and the noise that makes it so hard to predict. This looks like a problem for a classifier.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predict the outcome from the home and away average ratings only.
X = player_df[features[0:2]]
y = player_df[["outcome", "h_prob", "a_prob", "d_prob"]]
# Standardize the features so every model sees normalized inputs.
scaler = StandardScaler()
scaled = scaler.fit_transform(X)
# Hold out a third of the data for evaluation, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(scaled, y, test_size=0.33, random_state=42)
Now we'll define the functions to evaluate models and select the best one.
# Find the profit if you were to bet $5 on every prediction
def findProfit(outcomes, predictions):
    """Return the net profit of flat $5 bets at consensus implied odds.

    outcomes: DataFrame with an "outcome" column ("H"/"A"/"Draw") and
        h_prob/a_prob/d_prob implied-probability percentages per row.
    predictions: sequence of "H"/"A"/"Draw" picks aligned row-for-row
        with outcomes.
    """
    # Map a prediction to the column holding its implied probability.
    idx = {
        "H": "h_prob",
        "A": "a_prob",
        "Draw": "d_prob",
    }
    profit = 0
    for i in range(len(predictions)):
        row = outcomes.iloc[i]
        if predictions[i] == row["outcome"]:
            # Decimal odds are 100 / implied probability; payout minus the $5 stake.
            # BUG FIX: accumulate winnings with "+=" (was "=", so each win
            # overwrote the running total instead of adding to it).
            profit += (5 * (1 / (row[idx[predictions[i]]] / 100))) - 5
        else:
            profit -= 5
    return profit
# Find the accuracy and print the evaluation for each model
def testModels(x1, y1, x2, y2):
    """Fit a battery of sklearn classifiers on (x1, y1) and evaluate on (x2, y2).

    Prints each model's accuracy and its flat-$5-stake profit (via findProfit),
    and returns the list of prediction arrays in the order the models were run.
    """
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB, BernoulliNB
    from sklearn.linear_model import LogisticRegression, SGDClassifier
    from sklearn.svm import SVC, LinearSVC, NuSVC
    from sklearn.metrics import accuracy_score
    # (name, estimator) pairs replace eight copy-pasted stanzas; this also
    # stops the previous `SVC = SVC()` from shadowing the SVC class.
    classifiers = [
        ("GaussianNB", GaussianNB()),
        ("KNN", KNeighborsClassifier(n_neighbors=1)),
        ("BNB", BernoulliNB()),
        ("LR", LogisticRegression()),
        ("SDG", SGDClassifier()),
        ("SVC", SVC()),
        ("LSVC", LinearSVC()),
        ("NSVC", NuSVC()),
    ]
    models = []
    # Fit, predict, and report each model in turn (same order and output
    # format as before).
    for name, clf in classifiers:
        clf.fit(x1, y1["outcome"])
        models.append(clf.predict(x2))
        print(name, "Accuracy :", accuracy_score(y2["outcome"], models[-1]))
        print(name, "Profit :", findProfit(y2, models[-1]))
    return models
And test!
models = testModels(X_train, y_train, X_test, y_test)
GaussianNB Accuracy : 0.5054263565891473 GaussianNB Profit : -3.0555555555555554 KNN Accuracy : 0.4102325581395349 KNN Profit : -10.714285714285714 BNB Accuracy : 0.49007751937984495 BNB Profit : -3.0555555555555554 LR Accuracy : 0.5234108527131783 LR Profit : -3.0555555555555554 SDG Accuracy : 0.5100775193798449 SDG Profit : -3.0555555555555554 SVC Accuracy : 0.5187596899224807 SVC Profit : -3.0555555555555554 LSVC Accuracy : 0.5231007751937985 LSVC Profit : -3.0555555555555554 NSVC Accuracy : 0.39891472868217054 NSVC Profit : -3.0555555555555554
Well we aren't too far off! But... all the models are in the red slightly. The best appears to be LogisticRegression.
What if we just use the difference as a predictor?
import numpy as np

# Repeat the experiment using only the rating difference as the predictor.
diff_feature = np.asarray(player_df[features[2]]).reshape(-1, 1)
y = player_df[["outcome", "h_prob", "a_prob", "d_prob"]]
# Standardize so the models see a normalized input.
scaler = StandardScaler()
scaled = scaler.fit_transform(diff_feature)
# Same 67/33 split with the same seed as before.
X_train, X_test, y_train, y_test = train_test_split(scaled, y, test_size=0.33, random_state=42)
models = testModels(X_train, y_train, X_test, y_test)
GaussianNB Accuracy : 0.5209302325581395 GaussianNB Profit : -3.0555555555555554 KNN Accuracy : 0.4243410852713178 KNN Profit : -3.0555555555555554 BNB Accuracy : 0.4967441860465116 BNB Profit : -3.0555555555555554 LR Accuracy : 0.5210852713178294 LR Profit : -3.0555555555555554 SDG Accuracy : 0.5204651162790698 SDG Profit : -3.0555555555555554 SVC Accuracy : 0.521860465116279 SVC Profit : -3.0555555555555554 LSVC Accuracy : 0.5212403100775194 LSVC Profit : -3.0555555555555554 NSVC Accuracy : 0.31069767441860463 NSVC Profit : -5.294117647058824
It got worse -_- This perhaps suggests that my hypothesis is wrong. Perhaps teams' ratings are independent? When you think about this idea, it makes sense: a team's quality is NOT dependent on the opponent's quality. Their performance may be, but it is also just as dependent on themselves. Although we weren't able to outpredict Vegas, we got pretty close. Modern-day oddsmakers likely use machine learning to keep their numbers profitable. Even though it's impossible to be perfect, being as close to perfect in prediction as possible gives a company the biggest advantage, because it leaves little value for customers to exploit. But that's what makes the game fun, and addictive. No matter the power of Goliath, there is always that small belief in each person who considers themself David, armed with historical datasets and machine learning models.